In [45]:
import arff
import math
# Load the ARFF training dataset.
# Use a context manager so the file handle is closed deterministically even
# if parsing fails — the original passed a bare open() and leaked the handle.
# NOTE(review): mode 'rb' matches the Python-2 style of this notebook; under
# Python 3, liac-arff expects a text-mode handle — confirm interpreter version.
with open('attachments/trainProdSelection/trainProdSelection.arff', 'rb') as f:
    df = arff.load(f)
train = df['data']
In [46]:
# Shuffle the rows, then hold out 25% as the test set and keep the
# remaining 75% for training (a 3:1 split).
# NOTE(review): no fixed random seed, so the split differs on every run.
from random import shuffle

shuffle(train)
holdout = int(len(train) * 0.25)
test = train[:holdout]
train = train[holdout:]
In [47]:
# Split each training row into per-attribute column lists:
# customer type, lifestyle, vacation, eCredit, salary, property, label.
typ = [row[0] for row in train]
ls = [row[1] for row in train]
vac = [row[2] for row in train]
ec = [row[3] for row in train]
sal = [row[4] for row in train]
prp = [row[5] for row in train]
lab = [row[6] for row in train]
In [48]:
# Min-max normalize the numeric training columns to [0, 1] using
# scaled = (value - min) / (max - min).
def min_max_normalize(values):
    """Rescale a list of numbers in place to the [0, 1] range.

    Uses the list's own min/max. A constant column (max == min) maps to
    all zeros instead of raising ZeroDivisionError as the original did.
    Returns the same list for convenience.
    """
    lo = min(values)
    span = float(max(values) - lo)  # float() guards Python-2 integer division
    for i in range(len(values)):
        values[i] = (values[i] - lo) / span if span else 0.0
    return values

vac = min_max_normalize(vac)
ec = min_max_normalize(ec)
sal = min_max_normalize(sal)
prp = min_max_normalize(prp)
In [49]:
# Split the test rows into per-attribute columns and min-max normalize the
# numeric ones, mirroring the train-set preprocessing.
# NOTE(review): this rescales the test columns with the TEST set's own
# min/max rather than the train set's pre-normalization statistics, so
# train and test features are not on a strictly comparable scale — the
# original train stats are already overwritten by cell 48, confirm intent.
typ1 = [row[0] for row in test]
ls1 = [row[1] for row in test]
vac1 = [row[2] for row in test]
ec1 = [row[3] for row in test]
sal1 = [row[4] for row in test]
prp1 = [row[5] for row in test]
lab1 = [row[6] for row in test]

def _scale_unit(values):
    """Rescale a numeric list in place to [0, 1] via min-max scaling.

    A constant column (max == min) maps to all zeros instead of raising
    ZeroDivisionError. Returns the same list for convenience.
    """
    lo = min(values)
    span = float(max(values) - lo)  # float() guards Python-2 integer division
    for i in range(len(values)):
        values[i] = (values[i] - lo) / span if span else 0.0
    return values

vac1 = _scale_unit(vac1)
ec1 = _scale_unit(ec1)
sal1 = _scale_unit(sal1)
prp1 = _scale_unit(prp1)
In [50]:
# For every test sample, score it against all training samples with an
# inverse-distance similarity, then let the 3 most similar neighbours cast
# a weighted vote. Numeric attributes contribute squared Euclidean terms;
# each categorical attribute (type, lifestyle) adds 1 when it mismatches.
output = []
for i in range(len(test)):
    scored = []
    for j in range(len(train)):
        d2 = ((vac1[i] - vac[j]) ** 2 + (ec1[i] - ec[j]) ** 2 +
              (sal1[i] - sal[j]) ** 2 + (prp1[i] - prp[j]) ** 2)
        if typ1[i] != typ[j]:
            d2 += 1
        if ls1[i] != ls[j]:
            d2 += 1
        # An identical sample (d2 == 0) would divide by zero; treat it as
        # infinitely similar instead of crashing.
        score = 1 / math.sqrt(d2) if d2 > 0 else float('inf')
        scored.append((score, lab[j]))
    # Ascending sort, then take the last 3 entries = the 3 highest scores.
    # Slicing [-3:] also stays correct when fewer than 3 training rows
    # exist (the original range(len-3, len) double-counted in that case).
    scored.sort(key=lambda pair: pair[0])
    # Labels look like 'C1'..'C5'; the digit at index 1 selects the slot.
    votes = [0, 0, 0, 0, 0]
    for score, label in scored[-3:]:
        # BUGFIX: the original added list1[i][0] — indexed by the outer
        # test-sample counter — instead of the neighbour's own score, so
        # every neighbour contributed the wrong vote weight.
        votes[int(label[1]) - 1] += score
    output.append(votes)
In [51]:
#performance metric
#taking the maximum value index, which is the classifer and calulating the number of true predictions
count=0
for i in range(len(output)):
maxx=output[i][0]
imax=0
for j in range(1,len(output[i])):
if(output[i][j]>maxx):
maxx=output[i][j]
imax=j
imax=imax+1
if(int(lab1[i][1])==imax):
count=count+1
print count
#calculating the acuuracy of the model by the formula (number of true predictions / total number of predictions)
acc=(count/(len(test)*1.0))*100
print acc
In [ ]: